package org.yinxue.spider.core.util;

/**
 * Static helpers for classifying link URLs and merging them with a base URL.
 *
 * <p>Only the {@code http://} and {@code https://} schemes are recognised, and scheme matching is
 * case-insensitive. The class relies on {@code java.lang} only.
 *
 * @author zengjian
 * @date 2019/2/17
 * @since 1.0.0
 */
public abstract class UrlUtils {

    public static final String SCHEME_HTTP = "http://";
    public static final String SCHEME_HTTPS = "https://";

    /**
     * Tells whether the given link starts with {@code http://} or {@code https://}.
     *
     * @param hrefUrl the link to inspect, may be {@code null}
     * @return {@code true} if the link carries one of the two recognised schemes (any letter case);
     *         {@code false} otherwise, including for {@code null}
     */
    public static boolean isAbsoluteUrl(String hrefUrl) {
        return hasPrefixIgnoreCase(hrefUrl, SCHEME_HTTP) || hasPrefixIgnoreCase(hrefUrl, SCHEME_HTTPS);
    }

    /**
     * Negation of {@link #isAbsoluteUrl(String)}.
     *
     * @param hrefUrl the link to inspect, may be {@code null}
     * @return {@code true} if the link does not start with a recognised scheme
     */
    public static boolean isRelationUrl(String hrefUrl) {
        return !isAbsoluteUrl(hrefUrl);
    }

    /**
     * Extracts the scheme-plus-host part of a URL, e.g.
     * {@code http://www.xxx.com/index.html -> http://www.xxx.com}.
     *
     * <p>The host ends at the first {@code /}, {@code ?} or {@code #} after the scheme, or at the end
     * of the string. When the input carries neither recognised scheme, {@code http://} is prepended.
     *
     * @param srcUrl the URL to reduce, must not be {@code null}
     * @return the scheme (lower case) followed by everything up to the first path, query or
     *         fragment delimiter
     * @throws NullPointerException if {@code srcUrl} is {@code null}
     */
    public static String parseBaseUrl(String srcUrl) {
        String scheme = SCHEME_HTTP;
        String remainder = srcUrl;
        // Strip only the leading scheme; a scheme embedded later (e.g. in a query value) must stay.
        if (hasPrefixIgnoreCase(srcUrl, SCHEME_HTTP)) {
            remainder = srcUrl.substring(SCHEME_HTTP.length());
        } else if (hasPrefixIgnoreCase(srcUrl, SCHEME_HTTPS)) {
            scheme = SCHEME_HTTPS;
            remainder = srcUrl.substring(SCHEME_HTTPS.length());
        }
        int end = remainder.length();
        for (int i = 0; i < remainder.length(); i++) {
            if (isDelimiter(remainder.charAt(i))) {
                end = i;
                break;
            }
        }
        return scheme + remainder.substring(0, end);
    }


    /**
     * Merges a (possibly relative) link with a base URL. <br>
     * 1. /cn/reginit.doAction -> http://www.infoq.com/cn/reginit.doAction <br>
     * 2. /privacypolicy -> http://www.infoq.com/cn/privacypolicy <br>
     * 3. www.infoq.com/cn/reginit.doAction -> http://www.infoq.com/cn/reginit.doAction <br>
     * 4. ./yyy.html  -> http://www.xxx.com/yyy.html
     *
     * <p>Rules, applied in order:
     * <ul>
     *   <li>If either argument is {@code null} or empty, {@code baseUrl} is returned as is.</li>
     *   <li>A protocol-relative link ({@code //host/path}) takes {@code https:} when the base is an
     *       {@code https://} URL and {@code http:} otherwise.</li>
     *   <li>An absolute link is returned unchanged.</li>
     *   <li>Leading {@code ./} segments refer to the base itself; each leading {@code ../} removes
     *       the last path segment of the base (never the host).</li>
     *   <li>Otherwise the link is appended to the base. If the end of the base repeats the start of
     *       the link on whole path segments, the repeated part is emitted only once.</li>
     * </ul>
     * Exactly one {@code /} separates base and link in the result.
     *
     * @param hrefUrl the link found in a page
     * @param baseUrl the context URL the link is merged with
     * @return the merged URL, or {@code baseUrl} when either argument is {@code null} or empty
     */
    public static String parseAbsoluteUrl(String hrefUrl, String baseUrl) {
        if (isEmpty(hrefUrl) || isEmpty(baseUrl)) {
            return baseUrl;
        }

        // Protocol-relative link such as //v.youku.com/v_show/id.html: inherit the base scheme.
        if (hrefUrl.startsWith("//")) {
            String scheme = hasPrefixIgnoreCase(baseUrl, SCHEME_HTTPS) ? "https:" : "http:";
            return scheme + hrefUrl;
        }

        // Absolute links need no merging.
        if (isAbsoluteUrl(hrefUrl)) {
            return hrefUrl;
        }

        // Consume leading "./" and "../" segments with plain string operations, so characters
        // such as '$' or '\' in the base are never interpreted as a regex replacement.
        String base = baseUrl;
        String href = hrefUrl;
        boolean dotRelative = false;
        while (true) {
            if (href.equals(".") || href.startsWith("./")) {
                href = href.substring(1);
            } else if (href.equals("..") || href.startsWith("../")) {
                base = parentOf(base);
                href = href.substring(2);
            } else {
                break;
            }
            href = stripLeadingSlashes(href);
            dotRelative = true;
        }
        if (dotRelative) {
            return join(base, href);
        }

        return mergeOverlapping(href, base);
    }

    /** Appends hrefUrl to baseUrl, emitting a segment-aligned overlap between the two only once. */
    private static String mergeOverlapping(String hrefUrl, String baseUrl) {
        int baseLength = baseUrl.length();
        // Try the longest possible overlap first.
        for (int overlap = Math.min(baseLength, hrefUrl.length()); overlap > 0; overlap--) {
            int start = baseLength - overlap;
            if (baseUrl.regionMatches(start, hrefUrl, 0, overlap)
                    && startsSegment(baseUrl, start)
                    && endsSegment(hrefUrl, overlap)) {
                return join(baseUrl, hrefUrl.substring(overlap));
            }
        }
        return join(baseUrl, hrefUrl);
    }

    /** True if index is the start of the text, sits on a '/', or directly follows a '/'. */
    private static boolean startsSegment(String text, int index) {
        return index == 0 || text.charAt(index) == '/' || text.charAt(index - 1) == '/';
    }

    /** True if index is the end of the text, sits on a delimiter, or directly follows a '/'. */
    private static boolean endsSegment(String text, int index) {
        return index == text.length() || isDelimiter(text.charAt(index)) || text.charAt(index - 1) == '/';
    }

    /** Concatenates base and path with exactly one '/' between them (none before '?' or '#'). */
    private static String join(String base, String path) {
        if (path.isEmpty()) {
            return base;
        }
        boolean baseEndsWithSlash = base.endsWith("/");
        char first = path.charAt(0);
        if (first == '/') {
            return baseEndsWithSlash ? base + path.substring(1) : base + path;
        }
        if (baseEndsWithSlash || first == '?' || first == '#') {
            return base + path;
        }
        return base + "/" + path;
    }

    /** Removes the last path segment of base; the scheme and host part is never removed. */
    private static String parentOf(String base) {
        int end = base.length();
        while (end > 0 && base.charAt(end - 1) == '/') {
            end--;
        }
        String trimmed = base.substring(0, end);
        int schemeEnd = trimmed.indexOf("://");
        int authorityStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
        int lastSlash = trimmed.lastIndexOf('/');
        return lastSlash > authorityStart ? trimmed.substring(0, lastSlash) : trimmed;
    }

    /** Drops every leading '/' from text. */
    private static String stripLeadingSlashes(String text) {
        int start = 0;
        while (start < text.length() && text.charAt(start) == '/') {
            start++;
        }
        return text.substring(start);
    }

    /** True for the characters that end the host or a path segment: '/', '?' and '#'. */
    private static boolean isDelimiter(char c) {
        return c == '/' || c == '?' || c == '#';
    }

    /** Null-safe, case-insensitive prefix test. */
    private static boolean hasPrefixIgnoreCase(String value, String prefix) {
        return value != null && value.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** True for null and for the empty string. */
    private static boolean isEmpty(String value) {
        return value == null || value.isEmpty();
    }

}